Homework 3¶

In [47]:
import dalex as dx
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
In [2]:
data = pd.read_csv('hotel_bookings.csv')
data.head()
Out[2]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03

5 rows × 32 columns

In [3]:
# this time all variables will be used as it was possible to use machine with better processor
data = data[['is_canceled', 'lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
            'previous_cancellations', 'is_repeated_guest', 'arrival_date_month', 'deposit_type', 'customer_type']]
categorical_features = ['arrival_date_month', 'deposit_type', 'customer_type']
numeric_features = ['lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
            'previous_cancellations', 'is_repeated_guest']
data = data.dropna()
X, y = data.loc[:, data.columns != 'is_canceled'], data[['is_canceled']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
In [4]:
observation = pd.DataFrame({'lead_time': [82.0],
                       'arrival_date_year': [2015.0],
                       'adults': [2.0],
                       'children': [0.0],
                       'babies': [0.0],
                       'booking_changes': [0.0],
                       'previous_cancellations': [0.0],
                       'is_repeated_guest': [0.0],
                       'arrival_date_month': ['July'],
                       'deposit_type': ['No Deposit'],
                       'customer_type': ['Transient']},
                      index = ['observation'])

Neural network¶

In [5]:
def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val
In [32]:
categorical_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer(
   transformers=[
   ('categorical', categorical_transformer, categorical_features),
    ('numeric', numeric_transformer, numeric_features)
])
In [35]:
def keras_classifier_wrapper():
    model = Sequential()
    model.add(Dense(100, input_dim=X_train_nn.shape[1], activation='sigmoid'))
    model.add(Dense(50, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_metric])
    return model
In [39]:
model = KerasClassifier(keras_classifier_wrapper, epochs=100, batch_size=200)
callbacks = [EarlyStopping(monitor='f1_metric', mode='max', restore_best_weights=True, patience=10)]
nn = Pipeline(steps = [
               ('preprocessor', preprocessor),
               ('neural_network', model)
           ])
nn.fit(X_train, y_train, neural_network__callbacks=callbacks)
Epoch 1/100
538/538 [==============================] - 2s 3ms/step - loss: 0.5420 - accuracy: 0.7291 - f1_metric: 0.4596
Epoch 2/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4865 - accuracy: 0.7683 - f1_metric: 0.5776
Epoch 3/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4829 - accuracy: 0.7691 - f1_metric: 0.5777
Epoch 4/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4797 - accuracy: 0.7695 - f1_metric: 0.5779
Epoch 5/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4774 - accuracy: 0.7708 - f1_metric: 0.5814
Epoch 6/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4752 - accuracy: 0.7714 - f1_metric: 0.5844
Epoch 7/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4734 - accuracy: 0.7718 - f1_metric: 0.5873
Epoch 8/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4706 - accuracy: 0.7726 - f1_metric: 0.5912
Epoch 9/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4678 - accuracy: 0.7737 - f1_metric: 0.5971
Epoch 10/100
538/538 [==============================] - 1s 3ms/step - loss: 0.4663 - accuracy: 0.7727 - f1_metric: 0.5971
Epoch 11/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4647 - accuracy: 0.7732 - f1_metric: 0.6020
Epoch 12/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4634 - accuracy: 0.7729 - f1_metric: 0.6019
Epoch 13/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4626 - accuracy: 0.7729 - f1_metric: 0.6029
Epoch 14/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4621 - accuracy: 0.7734 - f1_metric: 0.6043
Epoch 15/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4610 - accuracy: 0.7731 - f1_metric: 0.6034
Epoch 16/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4603 - accuracy: 0.7731 - f1_metric: 0.6029
Epoch 17/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4593 - accuracy: 0.7736 - f1_metric: 0.6028
Epoch 18/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4584 - accuracy: 0.7738 - f1_metric: 0.6047
Epoch 19/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4582 - accuracy: 0.7743 - f1_metric: 0.6040
Epoch 20/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4568 - accuracy: 0.7750 - f1_metric: 0.6048
Epoch 21/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4563 - accuracy: 0.7744 - f1_metric: 0.6035
Epoch 22/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4557 - accuracy: 0.7747 - f1_metric: 0.6039
Epoch 23/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4551 - accuracy: 0.7748 - f1_metric: 0.6031
Epoch 24/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4539 - accuracy: 0.7753 - f1_metric: 0.6043
Epoch 25/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4536 - accuracy: 0.7753 - f1_metric: 0.6035
Epoch 26/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4529 - accuracy: 0.7751 - f1_metric: 0.6031
Epoch 27/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4524 - accuracy: 0.7756 - f1_metric: 0.6033
Epoch 28/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4518 - accuracy: 0.7753 - f1_metric: 0.6021
Epoch 29/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4512 - accuracy: 0.7764 - f1_metric: 0.6046
Epoch 30/100
538/538 [==============================] - 2s 3ms/step - loss: 0.4506 - accuracy: 0.7761 - f1_metric: 0.6040
Out[39]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(),
                                                  ['arrival_date_month',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numeric', StandardScaler(),
                                                  ['lead_time',
                                                   'arrival_date_year',
                                                   'adults', 'children',
                                                   'babies', 'booking_changes',
                                                   'previous_cancellations',
                                                   'is_repeated_guest'])])),
                ('neural_network',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x000001E9A13FE3A0>)])
In [55]:
y_pred = np.where(np.transpose(nn.predict_proba(X_test))[0] > 0.5, 0,1)
print(f'f1-score: {f1_score(y_test, y_pred)}')
# parameters were not tuned, so results are not the best
f1-score: 0.6001794258373205
In [57]:
nn.predict_proba(observation) # prediction(there will be no cancelation)
Out[57]:
array([[0.72625995, 0.27374008]], dtype=float32)
In [58]:
exp_nn = dx.Explainer(nn, X_train, y_train, label='neural_network')
Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier (default)
  -> label             : neural_network
  -> predict function  : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00479, mean = 0.358, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.984, mean = 0.012, max = 0.992
  -> model_info        : package sklearn

A new explainer has been created!
In [59]:
nn_profile = exp_nn.predict_profile(new_observation = observation)

# plot Ceteris Paribus profile
nn_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# in this model chidren and babies variables did not matter at all
# bigger lead time(ariund 95 days) increased probability of cancelation, so did lead time > 400 days,
# when lead_time values are between those values or are smaller than 95 days probability of not coming drops
# number of children greater than 1 makes cancelation more possible(and 1 child makes it less possible)
# number of babies and booking changes behave same and show less probability of cancelation around value 1
# then proability grows a bit and then falls down
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:01<00:00,  7.66it/s]

Random forest¶

In [61]:
forest = Pipeline(steps = [
               ('preprocessor', preprocessor),
               ('random_forest', RandomForestClassifier(max_depth=5, random_state=0))
           ])
forest.fit(X_train, y_train)
Out[61]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(),
                                                  ['arrival_date_month',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numeric', StandardScaler(),
                                                  ['lead_time',
                                                   'arrival_date_year',
                                                   'adults', 'children',
                                                   'babies', 'booking_changes',
                                                   'previous_cancellations',
                                                   'is_repeated_guest'])])),
                ('random_forest',
                 RandomForestClassifier(max_depth=5, random_state=0))])
In [73]:
print(f'f1-score: {f1_score(y_test, forest.predict(X_test))}')
f1-score: 0.5523871598500896
In [62]:
forest.predict(observation) # prediction(same as by neural network)
Out[62]:
array([0], dtype=int64)
In [63]:
exp_forest = dx.Explainer(forest, X_train, y_train, label='random_forest')
Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : random_forest
  -> predict function  : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0687, mean = 0.37, max = 0.976
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.749, mean = -0.000121, max = 0.913
  -> model_info        : package sklearn

A new explainer has been created!
In [65]:
forest_profile = exp_forest.predict_profile(new_observation = observation)

forest_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# this model was not so dependent on lead time, as well as on number of children and babies
# number of children > 1 makes cancelation more possible(by neural network it was rather less likely)
# none zero amount of booking changes caused the decrease of the cancelation probability
# number babies didn't have bigger impact on prediction
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:00<00:00, 19.92it/s]

AdaBoost¶

In [66]:
adaboost = Pipeline(steps = [
               ('preprocessor', preprocessor),
               ('adaboost', AdaBoostClassifier(n_estimators=100, random_state=0))
           ])
adaboost.fit(X_train, y_train)
Out[66]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  OneHotEncoder(),
                                                  ['arrival_date_month',
                                                   'deposit_type',
                                                   'customer_type']),
                                                 ('numeric', StandardScaler(),
                                                  ['lead_time',
                                                   'arrival_date_year',
                                                   'adults', 'children',
                                                   'babies', 'booking_changes',
                                                   'previous_cancellations',
                                                   'is_repeated_guest'])])),
                ('adaboost',
                 AdaBoostClassifier(n_estimators=100, random_state=0))])
In [74]:
print(f'f1-score: {f1_score(y_test, adaboost.predict(X_test))}')
f1-score: 0.5842662168379082
In [67]:
adaboost.predict(observation) # prediction
Out[67]:
array([0], dtype=int64)
In [68]:
exp_adaboost = dx.Explainer(adaboost, X_train, y_train, label='adaboost')
Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : sklearn.ensemble._weight_boosting.AdaBoostClassifier (default)
  -> label             : adaboost
  -> predict function  : <function yhat_proba_default at 0x000001E98FA0F820> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.484, mean = 0.5, max = 0.587
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.513, mean = -0.129, max = 0.514
  -> model_info        : package sklearn

A new explainer has been created!
In [71]:
adaboost_profile = exp_adaboost.predict_profile(new_observation = observation)

adaboost_profile.plot(variables = ['lead_time', 'children', 'babies', 'booking_changes'])
# here only number of children > 6 mattered(making cancelation more likely)
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 11/11 [00:00<00:00, 16.05it/s]

Summary¶

In general models demonstrated similar behavior(bigger lead time means increase and bigger number of booking changes means decrease of the cancelation probability) and had similar significant variables. Neural network however demonstrated a bit more complex decision changes(probility went severak times up or down depending on variables values). Of course the results would be more reliable if models were better trained(but hyperparameters tuning is out of scope for this homework).